{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "VOCAB_SIZE = 20000\n",
    "BATCH_SIZE = 32\n",
    "N_EPOCH = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sparse_tfidf(X):\n",
    "    t0 = time.time()\n",
    "    count = np.zeros((len(X), VOCAB_SIZE))\n",
    "    for i, indices in enumerate(X):\n",
    "        for idx in indices:\n",
    "            count[i, idx] += 1\n",
    "    print(\"%.2f secs ==> Document-Term Matrix\"%(time.time()-t0))\n",
    "\n",
    "    t0 = time.time()\n",
    "    X = TfidfTransformer().fit_transform(count)\n",
    "    print(\"%.2f secs ==> TF-IDF transform\\n\"%(time.time()-t0))\n",
    "    return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def next_train_batch(X, y):\n",
    "    for i in range(0, X.shape[0], BATCH_SIZE):\n",
    "        yield X[i : i+BATCH_SIZE].toarray(), y[i: i+BATCH_SIZE]\n",
    "\n",
    "def next_test_batch(X):\n",
    "    for i in range(0, X.shape[0], BATCH_SIZE):\n",
    "        yield X[i : i+BATCH_SIZE].toarray()\n",
    "\n",
    "def train_input_fn(X_train, y_train):\n",
    "    dataset = tf.data.Dataset.from_generator(\n",
    "        lambda: next_train_batch(X_train, y_train), (tf.float32, tf.int64),\n",
    "        (tf.TensorShape([None, VOCAB_SIZE]), tf.TensorShape([None])))\n",
    "    iterator = dataset.make_one_shot_iterator()\n",
    "    feats, labels = iterator.get_next()\n",
    "    feats = {'x': feats}\n",
    "    return feats, labels\n",
    "\n",
    "def predict_input_fn(X_test):\n",
    "    dataset = tf.data.Dataset.from_generator(\n",
    "        lambda: next_test_batch(X_test), tf.float32,\n",
    "        tf.TensorShape([None, VOCAB_SIZE]))\n",
    "    iterator = dataset.make_one_shot_iterator()\n",
    "    feats = iterator.get_next()\n",
    "    feats = {'x': feats}\n",
    "    return feats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.99 secs ==> Document-Term Matrix\n",
      "9.95 secs ==> TF-IDF transform\n",
      "\n",
      "3.62 secs ==> Document-Term Matrix\n",
      "9.35 secs ==> TF-IDF transform\n",
      "\n",
      "INFO:tensorflow:Using default config.\n",
      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmp4r396i7w\n",
      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmp4r396i7w', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x114ae1278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n",
      "INFO:tensorflow:Calling model_fn.\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "For now, only bucketized_column is supported but got: _NumericColumn(key='x', shape=(20000,), default_value=None, dtype=tf.float32, normalizer_fn=None)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-5-268ab04b0d8b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mN_EPOCH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m     \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtrain_input_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     12\u001b[0m     \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpredict_input_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredict_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'class_ids'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m     \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mkv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'class_ids'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_fn, hooks, steps, max_steps, saving_listeners)\u001b[0m\n\u001b[1;32m    361\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    362\u001b[0m     \u001b[0msaving_listeners\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_check_listeners_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msaving_listeners\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 363\u001b[0;31m     \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_train_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaving_listeners\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    364\u001b[0m     \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Loss for final step: %s.'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    365\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py\u001b[0m in \u001b[0;36m_train_model\u001b[0;34m(self, input_fn, hooks, saving_listeners)\u001b[0m\n\u001b[1;32m    841\u001b[0m       \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_train_model_distributed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaving_listeners\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    842\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 843\u001b[0;31m       \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_train_model_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaving_listeners\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    844\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    845\u001b[0m   \u001b[0;32mdef\u001b[0m \u001b[0m_train_model_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaving_listeners\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py\u001b[0m in \u001b[0;36m_train_model_default\u001b[0;34m(self, input_fn, hooks, saving_listeners)\u001b[0m\n\u001b[1;32m    854\u001b[0m       \u001b[0mworker_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_hooks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    855\u001b[0m       estimator_spec = self._call_model_fn(\n\u001b[0;32m--> 856\u001b[0;31m           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)\n\u001b[0m\u001b[1;32m    857\u001b[0m       return self._train_with_estimator_spec(estimator_spec, worker_hooks,\n\u001b[1;32m    858\u001b[0m                                              \u001b[0mhooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mglobal_step_tensor\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py\u001b[0m in \u001b[0;36m_call_model_fn\u001b[0;34m(self, features, labels, mode, config)\u001b[0m\n\u001b[1;32m    829\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    830\u001b[0m     \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Calling model_fn.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 831\u001b[0;31m     \u001b[0mmodel_fn_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_model_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    832\u001b[0m     \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Done calling model_fn.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    833\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/estimator/canned/boosted_trees.py\u001b[0m in \u001b[0;36m_model_fn\u001b[0;34m(features, labels, mode, config)\u001b[0m\n\u001b[1;32m    696\u001b[0m           \u001b[0mn_batches_per_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    697\u001b[0m           \u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 698\u001b[0;31m           closed_form_grad_and_hess_fn=closed_form)\n\u001b[0m\u001b[1;32m    699\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    700\u001b[0m     super(BoostedTreesClassifier, self).__init__(\n",
      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/estimator/canned/boosted_trees.py\u001b[0m in \u001b[0;36m_bt_model_fn\u001b[0;34m(features, labels, mode, head, feature_columns, tree_hparams, n_batches_per_layer, config, closed_form_grad_and_hess_fn, example_id_column_name, train_in_memory, name)\u001b[0m\n\u001b[1;32m    365\u001b[0m   \u001b[0;31m# tree).\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    366\u001b[0m   \u001b[0mmax_splits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m \u001b[0;34m<<\u001b[0m \u001b[0mtree_hparams\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_depth\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 367\u001b[0;31m   \u001b[0mmax_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_max_buckets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeature_columns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    368\u001b[0m   \u001b[0mtrain_op\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    369\u001b[0m   \u001b[0;32mwith\u001b[0m \u001b[0mops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname_scope\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/estimator/canned/boosted_trees.py\u001b[0m in \u001b[0;36m_get_max_buckets\u001b[0;34m(feature_columns)\u001b[0m\n\u001b[1;32m     74\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     75\u001b[0m       raise ValueError('For now, only bucketized_column is supported but '\n\u001b[0;32m---> 76\u001b[0;31m                        'got: {}'.format(fc))\n\u001b[0m\u001b[1;32m     77\u001b[0m   \u001b[0;32mreturn\u001b[0m \u001b[0mmax_buckets\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: For now, only bucketized_column is supported but got: _NumericColumn(key='x', shape=(20000,), default_value=None, dtype=tf.float32, normalizer_fn=None)"
     ]
    }
   ],
   "source": [
    "(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=VOCAB_SIZE)\n",
    "X_train = sparse_tfidf(X_train)\n",
    "X_test = sparse_tfidf(X_test)\n",
    "\n",
    "numeric_feat = tf.feature_column.numeric_column('x', shape=VOCAB_SIZE)\n",
    "\n",
    "model = tf.estimator.BoostedTreesClassifier(\n",
    "    [numeric_feat], X_train.shape[0]//BATCH_SIZE)\n",
    "\n",
    "for _ in range(N_EPOCH):\n",
    "    model.train(lambda: train_input_fn(X_train, y_train))\n",
    "    y_pred = list(model.predict(lambda: predict_input_fn(X_test), predict_keys='class_ids'))\n",
    "    y_pred = [kv['class_ids'][0] for kv in y_pred]\n",
    "    print(\"\\nValidation Accuracy: %.4f\\n\" % (y_pred==y_test).mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
